# load packages, installing if missing
if (!require(librarian)){
install.packages("librarian")
library(librarian)
}
librarian::shelf(
tidyverse, dismo, DT, here, htmltools, leaflet, mapview, raster, rgbif,
rgdal, rJava, sdmpredictors, sf, spocc, geojsonio, GGally, mgcv, maptools,
caret, # m: modeling framework
pdp, # X: partial dependence plots
ranger, # m: random forest modeling
rpart, # m: recursive partition modeling
rpart.plot, # m: recursive partition plotting
rsample, # d: split train/test data
skimr, # d: skim summarize data table
vip, # X: variable importance
dismo, # species distribution modeling: maxent(), predict(), evaluate(),
usdm # uncertainty analysis for species distribution models: vifcor()
)
select <- dplyr::select # overwrite raster::select
options(readr.show_col_type = FALSE, scipen = 999)
# set random seed for reproducibility
set.seed(42)
# graphical theme
ggplot2::theme_set(ggplot2::theme_light())
# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F, recursive = TRUE)
# file paths
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
dir_env <- file.path(dir_data, "env")
obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
mdl_maxv_rds <- file.path(dir_data, "mdl_maxent_vif.rds")
# logical, redo full calculations within if statements or not
redo <- FALSE
Species: Red-tailed Hawk (Buteo jamaicensis)
Total observations after creating bounding box around habitat area and removing duplicate geometries.
if (!file.exists(obs_geo) | redo){
# get species occurrence data from GBIF with coordinates
(res <- spocc::occ(
query = 'Buteo jamaicensis',
from = 'gbif',
has_coords = T,
limit = 10000))
# extract data frame from result
df <- res$gbif$data[[1]]
readr::write_csv(df, obs_csv)
# convert to points of observation from lon/lat columns in data frame
obs <- df %>%
# limit observations to bounding box around north and central america
filter(between(longitude, -167.593385, -51.171266),
between(latitude, 5.645215, 71.374349)) %>%
sf::st_as_sf(
coords = c("longitude", "latitude"),
crs = st_crs(4326)) %>%
select(prov, key) %>% # save space (joinable from obs_csv)
distinct(geometry, .keep_all = TRUE) # remove duplicate geometries
sf::write_sf(obs, obs_geo, delete_dsn = T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 9376
# show points on map
mapview::mapview(obs, map.types = "Esri.WorldStreetMap")
Explore environmental data options
# set a default data directory
options(sdmpredictors_datadir = dir_env)
# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)
# show table of datasets
env_datasets %>%
select(dataset_code, description, citation) %>%
DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")
# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
Choose environmental data layers
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", # altitude
"WC_bio1", # annual mean temperature
"WC_bio2", # mean diurnal temperature range
"ER_tri", # terrain roughness
"WC_bio12") # annual precipitation
# get layers
env_stack <- load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc = 2)
Create species range
if (!file.exists(obs_hull_geo) | redo){
# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
# save obs hull
write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(list(obs, obs_hull))
if (!file.exists(env_stack_grd) | redo){
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
raster::crop(extent(obs_hull_sp))
writeRaster(env_stack, env_stack_grd, overwrite = T)
}
env_stack <- stack(env_stack_grd)
# show map
plot(env_stack, nc = 2)
if (!file.exists(absence_geo) | redo){
# get raster count of observations
r_obs <- rasterize(
sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
# create mask for
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
# generate random points inside mask
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
as_tibble() %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
write_sf(absence, absence_geo, delete_dsn = T)
}
absence <- read_sf(absence_geo)
# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") +
mapview(absence, col.regions = "gray")
if (!file.exists(pts_env_csv) | redo){
# combine presence and absence into single set of labeled points
pts <- rbind(
obs %>%
mutate(present = 1) %>%
select(present, key),
absence %>% mutate(present = 0,
key = NA)) %>%
mutate(ID = 1:n()) %>%
relocate(ID)
write_sf(pts, pts_geo, delete_dsn = TRUE)
# extract raster values for points
pts_env <- raster::extract(env_stack, as_Spatial(pts), df = TRUE) %>%
tibble() %>%
# join present and geometry columns to raster value results for points
left_join(pts %>% select(ID, present),
by = "ID") %>%
relocate(present, .after = ID) %>%
# extract lon, lat as single columns
mutate(lon = st_coordinates(geometry)[,1],
lat = st_coordinates(geometry)[,2]) %>%
select(-geometry)
write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
pts_env %>%
# show first 10 presence, last 10 absence
slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>%
DT::datatable(rownames = FALSE,
options = list(dom = "t",
pageLength = 20))
pts_env %>%
select(-ID) %>%
mutate(present = factor(present)) %>%
pivot_longer(-present) %>%
ggplot() +
geom_density(aes(x = value, fill = present)) +
scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_bw() +
facet_wrap(~name, scales = "free") +
theme(legend.position = c(1, 0),
legend.justification = c(1, 0))
Question 1: There were a total of 7,004,451 observations in GBIF for the Red-tailed hawk (Buteo jamaicensis) as of 2022-01-24.
Question 2: There was one point from the original dataset that was in the ocean near Europe. To fix this issues, I created a bounding box around the parts of North and Central America and used this bbox to exclude any locations outside of the Red-tailed Hawks habitat.
Question 3: The environmental layers that I choose include: altitude, annual mean temperature, mean diurnal temperature range, terrain roughness, and annual precipitation. In the literature I found, other studies used: maximum and minimum temperatures, precipitation, solar radiation, wind speed, and cloud cover as abiotic factors. I felt that the first two variables from the literature were covered by annual mean temperature, mean diurnal temperature range, and annual precipitation. The other variables were not present in the available layers, so I did not include them.
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present), alpha = 0.5))
Set up data
# setup model data
model_data <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop rows with NA values
nrow(model_data)
## [1] 18682
# fit a linear model
mdl_lm <- lm(present ~ ., data = model_data) # . means all other columns (X)
summary(mdl_lm)
##
## Call:
## lm(formula = present ~ ., data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.13102 -0.41811 0.05709 0.41161 1.22059
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.46989444 0.07840354 5.993 0.000000002094 ***
## WC_alt -0.00010576 0.00001235 -8.564 < 0.0000000000000002 ***
## WC_bio1 0.03177874 0.00218006 14.577 < 0.0000000000000002 ***
## WC_bio2 -0.04752572 0.00216064 -21.996 < 0.0000000000000002 ***
## ER_tri -0.00029041 0.00013004 -2.233 0.0255 *
## WC_bio12 -0.00027986 0.00001037 -26.980 < 0.0000000000000002 ***
## lon -0.00149905 0.00030980 -4.839 0.000001316915 ***
## lat 0.01003139 0.00152964 6.558 0.000000000056 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.452 on 18674 degrees of freedom
## Multiple R-squared: 0.1832, Adjusted R-squared: 0.1828
## F-statistic: 598.1 on 7 and 18674 DF, p-value: < 0.00000000000000022
y_predict_lm <- predict(mdl_lm, model_data, type = "response")
y_true <- pts_env$present
range(y_predict_lm)
## [1] -0.5917617 1.1310212
range(y_true)
## [1] 0 1
# fit a generalized linear model with a binomial logit link function
mdl_glm <- glm(present ~ .,
family = binomial(link = "logit"),
data = model_data)
summary(mdl_glm)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = model_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5181 -1.0147 0.3938 0.9893 2.7853
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.32497197 0.40542651 -0.802 0.422810
## WC_alt -0.00056103 0.00006311 -8.889 < 0.0000000000000002 ***
## WC_bio1 0.16329620 0.01122489 14.548 < 0.0000000000000002 ***
## WC_bio2 -0.23208339 0.01141338 -20.334 < 0.0000000000000002 ***
## ER_tri -0.00136951 0.00066749 -2.052 0.040195 *
## WC_bio12 -0.00143635 0.00005592 -25.685 < 0.0000000000000002 ***
## lon -0.00562434 0.00155676 -3.613 0.000303 ***
## lat 0.05724220 0.00797420 7.178 0.000000000000705 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25899 on 18681 degrees of freedom
## Residual deviance: 22069 on 18674 degrees of freedom
## AIC: 22085
##
## Number of Fisher Scoring iterations: 4
y_predict_glm <- predict(mdl_glm, model_data, type = "response")
range(y_predict_glm)
## [1] 0.002979517 0.958016253
termplot(mdl_glm, partial.resid = TRUE, se = TRUE, main = F)
# fit a generalized additive model with smooth predictors
mdl_gam <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) +
s(WC_bio2) + s(ER_tri) + s(WC_bio12) + s(lon) + s(lat),
family = binomial, data = model_data)
summary(mdl_gam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(WC_bio12) +
## s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.38608 0.08701 -4.437 0.00000911 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 8.836 8.987 264.9 <0.0000000000000002 ***
## s(WC_bio1) 8.963 8.999 1255.3 <0.0000000000000002 ***
## s(WC_bio2) 7.613 8.306 658.3 <0.0000000000000002 ***
## s(ER_tri) 7.698 8.134 130.4 <0.0000000000000002 ***
## s(WC_bio12) 6.348 7.041 650.7 <0.0000000000000002 ***
## s(lon) 8.784 8.974 596.4 <0.0000000000000002 ***
## s(lat) 8.845 8.993 302.1 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.519 Deviance explained = 45.7%
## UBRE = -0.24079 Scale est. = 1 n = 18682
plot(mdl_gam, scale = 0)
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds") # create new file path
# show version of maxent
if (!interactive())
maxent()
## This is MaxEnt version 3.4.3
# plot environmental rasters
plot(env_stack, nc = 2)
# get presence-only observation points (maxent extracts raster values for you)
obs_sp <- sf::as_Spatial(obs) # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds) | redo){
mdl <- maxent(env_stack, obs_sp)
readr::write_rds(mdl, mdl_maxent_rds)
}
mdl_maxent <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl_maxent)
# plot term plots
response(mdl_maxent)
y_predict_maxent <- predict(env_stack, mdl_maxent)
plot(y_predict_maxent, main = 'Maxent, raw prediction')
data(wrld_simpl, package = "maptools")
plot(wrld_simpl, add = TRUE, border='dark grey')
Question 4: There are two environment variables that seem to contribute most towards presence. For altitude (WC_atl), below about 250 ft of altitude there is a higher chance of presence with a high confidence range. Where there is a diurnal temperature range (WC_bio2) between about 7 and 12 degree there is a higher chance of species presence. Additionally, between about -120 and -100 longitudinal degrees there is a higher chance of species presence.
Question 5: There are four environment variables that seem to contribute most towards presence for the maxent model. The first two, altitude (WC_alt) and diurnal temperature range (WC_bio2) were similar to the GAM results. For altitude, below about 500 ft of altitude there is a high chance of presence. Where there is a diurnal temperature range between about 5 and 15 degree there is a high chance of species presence. The other two variables (mean annual temperature and annual precipitation) had values that seems to predict species presence that did not show up in the GAM results. When the mean annual temperature (WC_bio1) is between 10 and 20 degrees there is a high chance of species presence. And when the annual precipitation (WC_bio12) is below 1,000 there is a high chance of species presence.
Set up data
decision_tree_data <- pts_env %>%
select(-ID) %>% # not used as a predictor x
mutate(present = factor(present)) %>% # categorical response
na.omit() # drop rows with NA
skim(decision_tree_data)
| Name | decision_tree_data |
| Number of rows | 18682 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| present | 0 | 1 | FALSE | 2 | 1: 9344, 0: 9338 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WC_alt | 0 | 1 | 555.52 | 640.74 | -74.00 | 99.00 | 275.00 | 792.00 | 3813.00 | ▇▂▁▁▁ |
| WC_bio1 | 0 | 1 | 12.30 | 6.62 | -5.70 | 7.70 | 12.30 | 17.30 | 29.00 | ▂▅▇▇▂ |
| WC_bio2 | 0 | 1 | 12.72 | 2.50 | 4.10 | 11.10 | 12.40 | 14.50 | 20.90 | ▁▃▇▃▁ |
| ER_tri | 0 | 1 | 25.77 | 34.44 | 0.00 | 5.28 | 11.42 | 32.08 | 272.13 | ▇▁▁▁▁ |
| WC_bio12 | 0 | 1 | 792.04 | 458.62 | 49.00 | 435.00 | 755.00 | 1066.00 | 4827.00 | ▇▃▁▁▁ |
| lon | 0 | 1 | -100.60 | 16.63 | -135.29 | -117.06 | -100.62 | -86.80 | -60.04 | ▅▇▇▇▂ |
| lat | 0 | 1 | 37.97 | 9.00 | 9.21 | 33.10 | 38.53 | 43.78 | 60.46 | ▁▂▇▇▂ |
# create training set with 80% of full data
d_split <- rsample::initial_split(decision_tree_data,
prop = 0.8,
strata = "present")
d_train <- rsample::training(d_split)
# show number of rows present is 0 vs 1 (all data)
table(decision_tree_data$present)
##
## 0 1
## 9338 9344
# show number of rows present is 0 vs 1 (training data)
table(d_train$present)
##
## 0 1
## 7470 7475
# run decision stump model
mdl_dt1 <- rpart(present ~ .,
data = d_train,
control = list(cp = 0,
minbucket = 5,
maxdepth = 1))
mdl_dt1
## n= 14945
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 14945 7470 1 (0.49983272 0.50016728)
## 2) WC_bio1< 6.35 2756 181 0 (0.93432511 0.06567489) *
## 3) WC_bio1>=6.35 12189 4895 1 (0.40159160 0.59840840) *
# plot tree
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl_dt1)
# decision tree with defaults
mdl_dt2 <- rpart(present ~ ., data = d_train)
mdl_dt2
## n= 14945
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 14945 7470 1 (0.49983272 0.50016728)
## 2) WC_bio1< 6.35 2756 181 0 (0.93432511 0.06567489) *
## 3) WC_bio1>=6.35 12189 4895 1 (0.40159160 0.59840840)
## 6) lon>=-116.1268 8654 4206 0 (0.51398197 0.48601803)
## 12) lon< -80.37332 6710 2693 0 (0.59865872 0.40134128)
## 24) WC_bio12>=1342.5 825 99 0 (0.88000000 0.12000000) *
## 25) WC_bio12< 1342.5 5885 2594 0 (0.55921835 0.44078165)
## 50) WC_bio2>=12.85 3423 1123 0 (0.67192521 0.32807479) *
## 51) WC_bio2< 12.85 2462 991 1 (0.40251828 0.59748172)
## 102) lon>=-96.47691 1777 877 1 (0.49352842 0.50647158)
## 204) lat< 28.29523 192 29 0 (0.84895833 0.15104167) *
## 205) lat>=28.29523 1585 714 1 (0.45047319 0.54952681)
## 410) lon< -89.61047 531 194 0 (0.63465160 0.36534840) *
## 411) lon>=-89.61047 1054 377 1 (0.35768501 0.64231499) *
## 103) lon< -96.47691 685 114 1 (0.16642336 0.83357664) *
## 13) lon>=-80.37332 1944 431 1 (0.22170782 0.77829218)
## 26) lat< 38.57665 347 108 0 (0.68876081 0.31123919) *
## 27) lat>=38.57665 1597 192 1 (0.12022542 0.87977458) *
## 7) lon< -116.1268 3535 447 1 (0.12644979 0.87355021) *
rpart.plot(mdl_dt2)
# plot complexity parameter
plotcp(mdl_dt2)
# rpart cross validation results
mdl_dt2$cptable
## CP nsplit rel error xerror xstd
## 1 0.32048193 0 1.0000000 1.0279786 0.008179589
## 2 0.08862115 1 0.6795181 0.6815261 0.007756026
## 3 0.03212851 3 0.5022758 0.5062918 0.007115127
## 4 0.01753681 5 0.4380187 0.4420348 0.006789729
## 5 0.01236055 6 0.4204819 0.4242303 0.006689466
## 6 0.01000000 9 0.3834003 0.4000000 0.006545350
# caret cross validation results
mdl_caret <- train(present ~ .,
data = d_train,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 20)
ggplot(mdl_caret)
vip(mdl_caret, num_features = 40, bar = FALSE)
Partial dependence plots
# Construct partial dependence plots
partial_dependency1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
partial_dependency2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
partial_dependency3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>%
plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
colorkey = TRUE, screen = list(z = -20, x = -60))
# Display plots side by side
gridExtra::grid.arrange(partial_dependency1, partial_dependency2, partial_dependency3, ncol = 3)
# number of features
n_features <- length(setdiff(names(d_train), "present"))
# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)
# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.2908209
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(present ~ .,
data = d_train,
importance = "impurity")
# re-run model with permutation-based variable importance
mdl_permutation <- ranger(present ~ .,
data = d_train,
importance = "permutation")
p_impurity <- vip::vip(mdl_impurity, bar = FALSE)
p_permutation <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p_impurity, p_permutation, nrow = 1)
Question 6: It is recommended that we use a tree of size 5.
Question 7: The top three most important variables for my model are: latitude, annual mean temperature (WC_bio1), and altitude (WC_alt).
Question 8: In the permutations importance plot for the RandomForest model the most important variables are longitude, latitude, and mean annual temperature (WC_bio1). This is different from the Rpart model, where the important variables were latitude, mean annual temperature (WC_bio1), and altitude (WC_alt).
Split observations into training and testing
pts <- read_sf(pts_geo)
# create training set with 80% of full data
pts_split <- rsample::initial_split(pts,
prop = 0.8,
strata = "present")
pts_train <- rsample::training(pts_split)
pts_test <- rsample::testing(pts_split)
pts_train_p <- pts_train %>%
filter(present == 1) %>%
as_Spatial()
pts_train_a <- pts_train %>%
filter(present == 0) %>%
as_Spatial()
# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)
# calculate variance inflation factor (VIF) per predictor, a metric of multicollinearity between variables
vif(env_stack)
## Variables VIF
## 1 WC_alt 2.972943
## 2 WC_bio1 1.732601
## 3 WC_bio2 3.177039
## 4 ER_tri 1.874529
## 5 WC_bio12 2.282351
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th = 0.7)
v
## No variable from the 5 input variables has collinearity problem.
##
## The linear correlation coefficients ranges between:
## min correlation ( ER_tri ~ WC_bio1 ): -0.1380049
## max correlation ( WC_bio12 ~ WC_bio2 ): -0.589015
##
## ---------- VIFs of the remained variables --------
## Variables VIF
## 1 WC_alt 2.729156
## 2 WC_bio1 1.707131
## 3 WC_bio2 3.111583
## 4 ER_tri 1.836237
## 5 WC_bio12 2.201158
# reduce enviromental raster stack by
env_stack_v <- usdm::exclude(env_stack, v)
# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)
# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds) | redo){
mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
mdl_maxv <- read_rds(mdl_maxv_rds, sf::as_Spatial(pts_train))
# plot variable contributions per predictor
plot(mdl_maxv)
# plot term plots
response(mdl_maxv)
# predict
y_maxv <- predict(env_stack, mdl_maxv)
plot(y_maxv, main = 'Maxent, raw prediction')
data(wrld_simpl, package = "maptools")
plot(wrld_simpl, add = TRUE, border = 'dark grey')
Reciever Operater Characteristic (ROC) Curve
pts_test_p <- pts_test %>%
filter(present == 1) %>%
as_Spatial()
pts_test_a <- pts_test %>%
filter(present == 0) %>%
as_Spatial()
y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)
e <- dismo::evaluate(
p = pts_test_p,
a = pts_test_a,
model = mdl_maxv,
x = env_stack)
e
## class : ModelEvaluation
## n presences : 1873
## n absences : 1868
## AUC : 0.8747603
## cor : 0.636749
## max TPR+TNR at : 0.6642459
plot(e, 'ROC')
thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6642459
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)
# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)
matrix(
c(tpr, fnr,
fpr, tnr),
nrow = 2, dimnames = list(
c("present_obs", "absent_obs"),
c("present_pred", "absent_pred")))
## present_pred absent_pred
## present_obs 0.72504 0.1381156
## absent_obs 0.27496 0.8618844
Area Under the Curve (AUC)
plot(e, 'ROC')
# add point to ROC plot
points(fpr, tpr, pch = 23, bg = "blue") ## BREAKS HERE
plot(y_maxv > thr)
Question 9: No variables were removed due to multicollinearity as the highest correllation between variables is 0.6 for mean diurnal temperature range (WC_bio2) and annual precipiation (WC_bio12). The variables ranked in importance from most to least are:
- Annual mean temperature (WC_bio1)
- Altitude (WC_alt)
- Annual precipitation (WC_bio12)
- Terrain roughness (ER_tri)
- Mean Diurnal temperature range (WC_bio2)